import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import panel as pn
pn.extension('tabulator')
import warnings
warnings.filterwarnings('ignore')
# choropleth map
import plotly.express as px
from urllib.request import urlopen
import json
import os
# List every file under the Kaggle input directory (no-op when run locally).
# FIX: the loop bodies lost their indentation in the notebook export.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Load the preprocessed EPA sensor dataset (one row per sensor per measurement date).
epa_prep = pd.read_csv('./input/epa_preprocessed.csv')
# (rows, columns) sanity check
epa_prep.shape
(262888, 30)
# Drop the CSV's leftover positional index column and parse the date strings.
epa_prep.drop(columns='Unnamed: 0', inplace=True)
epa_prep['date'] = pd.to_datetime(epa_prep['date'])
# inspect column dtypes
epa_prep.dtypes
sensor_id object sensor_lat float64 sensor_long float64 county object state object cbsa object color float64 income float64 language float64 education float64 date datetime64[ns] temp float64 humidity float64 wind_speed float64 wind_direction float64 pm25 float64 ozone float64 no2 float64 co float64 so2 float64 lead float64 benzene float64 cbsa0 object cbsa1 object year int64 month int64 day int64 weekday int64 geometry object dtype: object
# sensor_id -> attribute lookup tables (typo "snesor_id" fixed; the four
# copies of the same set_index/to_dict expression are factored into a helper).
def _sensor_map(df, col):
    """Return a {sensor_id: df[col]} dict; for duplicate ids the last row wins."""
    return df[['sensor_id', col]].set_index('sensor_id').to_dict()[col]

sensor_cbsa_dict = _sensor_map(epa_prep, 'cbsa')
sensor_cbsa_dict0 = _sensor_map(epa_prep, 'cbsa0')
sensor_cbsa_dict1 = _sensor_map(epa_prep, 'cbsa1')
sensor_state_dict = _sensor_map(epa_prep, 'state')
# example dict call: look up the CBSA name for one sensor id
sensor_cbsa_dict['01-033-1002']
'Florence-Muscle Shoals, AL'
# check for missing values in sensor related data
# columns carried forward into the working frame below
key_cols = ['date','sensor_id','sensor_lat', 'sensor_long','geometry','cbsa', 'ozone','pm25','color','education','language','income','temp']
epa_prep[key_cols].isnull().sum()
date 0 sensor_id 0 sensor_lat 0 sensor_long 0 geometry 0 cbsa 0 ozone 0 pm25 0 color 91 education 91 language 91 income 91 temp 118340 dtype: int64
The key location/pollutant columns (date, sensor_id, lat/long, geometry, cbsa, ozone, pm25) have no missing values; the demographic columns each have 91 missing and temp has 118,340 missing — these are handled below.
# Working frame restricted to the key columns.
# FIX: take an explicit copy so the assignments on this frame below do not
# trigger pandas SettingWithCopyWarning / silently write through to epa_prep.
epa = epa_prep[key_cols].copy()
# 'date' was already parsed above; re-parsing is a harmless safeguard
epa['date'] = pd.to_datetime(epa['date'])
epa.head(3)
| date | sensor_id | sensor_lat | sensor_long | geometry | cbsa | ozone | pm25 | color | education | language | income | temp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-03-01 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.029 | 3.8 | 0.13 | 0.03 | 0.0 | 0.26 | NaN |
| 1 | 2019-03-04 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.034 | 6.5 | 0.13 | 0.03 | 0.0 | 0.26 | NaN |
| 2 | 2019-03-07 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.053 | 8.6 | 0.13 | 0.03 | 0.0 | 0.26 | NaN |
# Index by measurement date. set_index('date') both promotes the column to
# the index and removes it, replacing the original set_index(Series)+drop pair.
epa = epa.set_index('date')
epa.head(2)
| sensor_id | sensor_lat | sensor_long | geometry | cbsa | ozone | pm25 | color | education | language | income | temp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | ||||||||||||
| 2019-03-01 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.029 | 3.8 | 0.13 | 0.03 | 0.0 | 0.26 | NaN |
| 2019-03-04 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.034 | 6.5 | 0.13 | 0.03 | 0.0 | 0.26 | NaN |
# count of unique values
# sensor_id / lat / long / geometry all show 525 -> one fixed location per sensor
epa.nunique().sort_values(ascending=False).to_frame()
| 0 | |
|---|---|
| temp | 13758 |
| pm25 | 1045 |
| sensor_id | 525 |
| sensor_lat | 525 |
| sensor_long | 525 |
| geometry | 525 |
| cbsa | 306 |
| ozone | 119 |
| color | 99 |
| income | 89 |
| education | 59 |
| language | 38 |
sensor id, lat, long have the same 525 unique values. This means that we can group by 'sensor_id' using median without changing the values of sensor_lat/long.
# import the CBSA -> county FIPS crosswalk
fips_df = pd.read_csv('./input/cbsa_fips.csv')
fips_df.head(1).T
fips_df1 = fips_df[['cbsacode','cbsatitle','statename','fipsstatecode','fipscountycode']].dropna()
# code columns cast to int; name columns kept as-is
fips_df2 = fips_df1[['cbsacode','fipsstatecode','fipscountycode']].astype(int)
fips_df3 = fips_df1[['cbsatitle','statename']]
# Both frames are row-aligned slices of fips_df1, so a plain index join
# replaces the merge(left_on=index, right_on=index) + drop('key_0') dance.
fips = fips_df3.join(fips_df2)
# add leading '0' to fips codes: state -> 2 chars, county -> 3 chars.
# Vectorized str.zfill replaces the per-element '{0:0>2}'.format lambdas.
fips['fipsstatecode'] = fips['fipsstatecode'].astype(str).str.zfill(2)
fips['fipscountycode'] = fips['fipscountycode'].astype(str).str.zfill(3)
# 5-char county FIPS = state(2) + county(3)
fips['fips'] = fips['fipsstatecode'] + fips['fipscountycode']
# create {cbsa : fips} dict
fips_dict = fips.set_index('cbsatitle')['fips'].to_dict()
epa['fips'] = epa.cbsa.map(fips_dict)
epa.isnull().sum()
sensor_id 0 sensor_lat 0 sensor_long 0 geometry 0 cbsa 0 ozone 0 pm25 0 color 91 education 91 language 91 income 91 temp 118340 fips 25052 dtype: int64
# preview with the new fips column attached
epa.head()
| sensor_id | sensor_lat | sensor_long | geometry | cbsa | ozone | pm25 | color | education | language | income | temp | fips | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||
| 2019-03-01 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.029 | 3.8 | 0.13 | 0.03 | 0.0 | 0.26 | NaN | 01003 |
| 2019-03-04 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.034 | 6.5 | 0.13 | 0.03 | 0.0 | 0.26 | NaN | 01003 |
| 2019-03-07 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.053 | 8.6 | 0.13 | 0.03 | 0.0 | 0.26 | NaN | 01003 |
| 2019-03-10 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.030 | 7.5 | 0.13 | 0.03 | 0.0 | 0.26 | NaN | 01003 |
| 2019-03-13 | 01-003-0010 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.046 | 8.8 | 0.13 | 0.03 | 0.0 | 0.26 | NaN | 01003 |
# get index with missing fips
# re-key the frame by sensor_id so missing rows can be addressed per sensor
epa = epa.reset_index()
epa = epa.set_index('sensor_id')
# CBSA names that had no entry in fips_dict
missing_fips_list = epa[epa.fips.isnull()]['cbsa'].unique().tolist()
missing_fips_list
['La Paz, AZ', 'Calaveras, CA', 'Colusa, CA', 'Bishop, CA', 'Siskiyou, CA', 'Delta, CO', 'Rio Blanco, CO', 'Randolph, IL', 'Montgomery, IA', 'Palo Alto, IA', 'Van Buren, IA', 'Neosho, KS', 'Trego, KS', 'Carter, KY', 'Perry, KY', 'Pike, KY', 'Hancock, ME', 'Garrett, MD', 'Kent, MD', 'Manistee, MI', 'Schoolcraft, MI', 'Becker, MN', 'Lake, MN', 'Cedar, MO', 'Fergus, MT', 'Phillips, MT', 'Powder River, MT', 'Richland, MT', 'Rosebud, MT', 'Essex, NY', 'Swain, NC', 'Burke, ND', 'Dunn, ND', 'McKenzie, ND', 'Mercer, ND', 'Dewey, OK', 'Nowata, OK', 'Greene, PA', 'Tioga, PA', 'Chesterfield, SC', 'Jackson, SD', 'Brewster, TX', 'Duchesne, UT', 'Ashland, WI', 'Forest, WI', 'Vilas, WI', 'Park, WY', 'Sublette, WY']
# one-row preview of the sensor_id-indexed frame
epa.head(1)
| date | sensor_lat | sensor_long | geometry | cbsa | ozone | pm25 | color | education | language | income | temp | fips | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sensor_id | |||||||||||||
| 01-003-0010 | 2019-03-01 | 30.497478 | -87.880258 | POINT (-87.880258 30.497478) | Daphne-Fairhope-Foley, AL | 0.029 | 3.8 | 0.13 | 0.03 | 0.0 | 0.26 | NaN | 01003 |
# {county : fips} dict
county_fips = pd.read_csv('./input/county_fips.csv', usecols=['FIPS','Name','State'])
# BUG FIX: FIPS is parsed as int (e.g. 1001), but epa['fips'] and the
# choropleth GeoJSON ids are 5-char zero-padded strings (e.g. '01003').
# Without padding, county-sourced codes would never match on the maps.
county_fips['FIPS'] = county_fips['FIPS'].astype(str).str.zfill(5)
county_fips['county'] = county_fips['Name'] + ', ' + county_fips['State']
county_fips_dict = county_fips[['county','FIPS']].set_index('county').to_dict()['FIPS']
county_fips
| FIPS | Name | State | county | |
|---|---|---|---|---|
| 0 | 1001 | Autauga | AL | Autauga, AL |
| 1 | 1003 | Baldwin | AL | Baldwin, AL |
| 2 | 1005 | Barbour | AL | Barbour, AL |
| 3 | 1007 | Bibb | AL | Bibb, AL |
| 4 | 1009 | Blount | AL | Blount, AL |
| ... | ... | ... | ... | ... |
| 3227 | 72151 | Yabucoa | PR | Yabucoa, PR |
| 3228 | 72153 | Yauco | PR | Yauco, PR |
| 3229 | 78010 | St. Croix | VI | St. Croix, VI |
| 3230 | 78020 | St. John | VI | St. John, VI |
| 3231 | 78030 | St. Thomas | VI | St. Thomas, VI |
3232 rows × 4 columns
# sensor ids (with duplicates, one per row) still lacking a fips code
missing_fips_index = epa[epa.fips.isnull()].index
missing_fips_index
Index(['04-012-8000', '04-012-8000', '04-012-8000', '04-012-8000',
'04-012-8000', '04-012-8000', '04-012-8000', '04-012-8000',
'04-012-8000', '04-012-8000',
...
'56-035-0101', '56-035-0101', '56-035-0101', '56-035-0101',
'56-035-0101', '56-035-0101', '56-035-0101', '56-035-0101',
'56-035-0101', '56-035-0101'],
dtype='object', name='sensor_id', length=25052)
# null counts before the county-name fallback fill
epa.isnull().sum()
date 0 sensor_lat 0 sensor_long 0 geometry 0 cbsa 0 ozone 0 pm25 0 color 91 education 91 language 91 income 91 temp 118340 fips 25052 dtype: int64
# preview the fips column alone
epa[['fips']]
| fips | |
|---|---|
| sensor_id | |
| 01-003-0010 | 01003 |
| 01-003-0010 | 01003 |
| 01-003-0010 | 01003 |
| 01-003-0010 | 01003 |
| 01-003-0010 | 01003 |
| ... | ... |
| 72-021-0010 | 72151 |
| 72-021-0010 | 72151 |
| 72-021-0010 | 72151 |
| 72-021-0010 | 72151 |
| 72-021-0010 | 72151 |
262888 rows × 1 columns
# Fill fips for the flagged sensors via the county-name lookup.
# isin() already yields booleans; the '== True' comparison was redundant.
mask = epa.index.isin(missing_fips_index)
epa['fips'] = np.where(mask, epa['cbsa'].map(county_fips_dict), epa['fips'])
epa.isnull().sum()
date 0 sensor_lat 0 sensor_long 0 geometry 0 cbsa 0 ozone 0 pm25 0 color 91 education 91 language 91 income 91 temp 118340 fips 798 dtype: int64
# rows still missing fips after the county fallback (all 06-027-xxxx sensors)
missing_section_index = epa[epa.fips.isnull()].index
missing_section_index
Index(['06-027-0002', '06-027-0002', '06-027-0002', '06-027-0002',
'06-027-0002', '06-027-0002', '06-027-0002', '06-027-0002',
'06-027-0002', '06-027-0002',
...
'06-027-1023', '06-027-1023', '06-027-1023', '06-027-1023',
'06-027-1023', '06-027-1023', '06-027-1023', '06-027-1023',
'06-027-1023', '06-027-1023'],
dtype='object', name='sensor_id', length=798)
# fill in missing section fips
# Sensor ids 06-027-xxxx encode state 06 / county 027 (cf. '01-003-0010'
# -> fips '01003' above), i.e. FIPS 06027 for the Bishop, CA area.
# FIX: the previous value '06798' is not a valid state+county FIPS code
# and would not match the choropleth GeoJSON.
section_fips_dict = {'Bishop, CA':'06027'}
# map fips to cbsa for the remaining flagged rows ('== True' was redundant)
epa['fips'] = np.where(epa.index.isin(missing_section_index), epa['cbsa'].map(section_fips_dict), epa['fips'])
epa.isnull().sum()
date 0 sensor_lat 0 sensor_long 0 geometry 0 cbsa 0 ozone 0 pm25 0 color 91 education 91 language 91 income 91 temp 118340 fips 0 dtype: int64
# 525 unique sensors: per-sensor medians of the numeric columns.
# FIX: numeric_only=True is required on pandas >= 2.0 — the frame carries
# object/datetime columns (geometry, cbsa, date, fips) that median() can no
# longer silently drop.
sensor_list_grouped = epa.groupby('sensor_id').median(numeric_only=True).sort_values('income', ascending=False)
sensor_list_grouped = sensor_list_grouped.rename(columns={'temp':'median_temp'})
# count of measured dates in each sensor (typo "mesured" fixed)
sensor_date_counts = epa.reset_index().groupby('sensor_id').nunique()['date'].to_frame()
# join on the shared sensor_id index, replacing the
# merge(left_on=index) + rename('key_0') + set_index round-trip
sensor_list = sensor_list_grouped.join(sensor_date_counts).rename(columns={'date':'date_counts'})
sensor_list.sort_values('date_counts', ascending=False)
| sensor_lat | sensor_long | ozone | pm25 | color | education | language | income | median_temp | date_counts | |
|---|---|---|---|---|---|---|---|---|---|---|
| sensor_id | ||||||||||
| 55-079-0026 | 43.060975 | -87.913504 | 0.035 | 7.10 | 0.81 | 0.04 | 0.00 | 0.50 | 7.500000 | 726 |
| 29-510-0085 | 38.656429 | -90.198348 | 0.036 | 7.85 | 0.66 | 0.17 | 0.00 | 0.67 | 14.965278 | 724 |
| 42-071-0012 | 40.043833 | -76.112400 | 0.036 | 8.00 | 0.00 | 0.64 | 0.08 | 0.36 | NaN | 723 |
| 42-003-0008 | 40.465420 | -79.960757 | 0.037 | 7.50 | 0.09 | 0.06 | 0.02 | 0.22 | 12.916667 | 723 |
| 13-089-0002 | 33.687800 | -84.290500 | 0.034 | 7.80 | 1.00 | 0.10 | 0.00 | 0.23 | 17.951389 | 723 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17-031-0001 | 41.670992 | -87.732457 | 0.046 | 7.50 | 0.62 | 0.19 | 0.02 | 0.22 | NaN | 69 |
| 48-453-0014 | 30.354944 | -97.761803 | 0.034 | 5.60 | 0.10 | 0.05 | 0.06 | 0.13 | 16.041668 | 65 |
| 17-031-0076 | 41.751400 | -87.713488 | 0.044 | 6.60 | 0.92 | 0.28 | 0.11 | 0.18 | NaN | 61 |
| 01-033-1002 | 34.762619 | -87.638097 | 0.041 | 7.40 | 0.40 | 0.02 | 0.00 | 0.18 | NaN | 50 |
| 42-063-0004 | 40.563330 | -78.919972 | 0.030 | 7.20 | 0.04 | 0.14 | 0.02 | 0.22 | NaN | 41 |
525 rows × 10 columns
# date count distribution
plt.figure(figsize=(3,3))
sensor_list['date_counts'].sort_values().plot(kind='box')
plt.title('525 sensors with count of measured dates')
plt.show()
# keep only sensors with more than 612 days of measured data
# median date count — the filtering threshold applied further below
sensor_list.date_counts.quantile(0.5)
# 612 days out of ~730 (2 years) ~= 84% coverage
print(f'filtered data covers {round(612/365/2*100,0)}% dates from 2019-2020')
filtered data covers 84.0% dates from 2019-2020
# complete fips dict
# {sensor_id: fips}; duplicate sensor_id index entries collapse (last row wins).
# NOTE(review): this rebinds fips_dict, which above held the cbsa->fips map.
fips_dict = epa[['fips']].to_dict()['fips']
sensor_list['fips'] = sensor_list.index.map(fips_dict)
sensor_list.isnull().sum()
sensor_lat 0 sensor_long 0 ozone 0 pm25 0 color 1 education 1 language 1 income 1 median_temp 236 date_counts 0 fips 0 dtype: int64
# Attach CBSA / state labels by mapping each sensor-id lookup table
# onto the sensor_list index, one column per table.
label_maps = {
    'cbsa': sensor_cbsa_dict,
    'cbsa0': sensor_cbsa_dict0,
    'cbsa1': sensor_cbsa_dict1,
    'state': sensor_state_dict,
}
for label_col, lookup in label_maps.items():
    sensor_list[label_col] = sensor_list.index.map(lookup)
sensor_list
| sensor_lat | sensor_long | ozone | pm25 | color | education | language | income | median_temp | date_counts | fips | cbsa | cbsa0 | cbsa1 | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sensor_id | |||||||||||||||
| 17-163-0010 | 38.612034 | -90.160477 | 0.0420 | 7.9 | 1.00 | 0.20 | 0.00 | 0.99 | NaN | 73 | 29510 | St. Louis, MO-IL | St. Louis | MO-IL | Illinois |
| 48-141-0044 | 31.765685 | -106.455227 | 0.0450 | 7.7 | 1.00 | 0.47 | 0.30 | 0.98 | 19.907410 | 612 | 48229 | El Paso, TX | El Paso | TX | Texas |
| 13-127-0006 | 31.169805 | -81.495035 | 0.0355 | 6.8 | 0.98 | 0.49 | 0.08 | 0.96 | NaN | 142 | 13191 | Brunswick, GA | Brunswick | GA | Georgia |
| 08-031-0026 | 39.779490 | -105.005180 | 0.0430 | 6.2 | 0.59 | 0.32 | 0.18 | 0.94 | 11.504629 | 631 | 08093 | Denver-Aurora-Lakewood, CO | Denver-Aurora-Lakewood | CO | Colorado |
| 37-147-0006 | 35.641276 | -77.360126 | 0.0405 | 6.0 | 0.64 | 0.24 | 0.00 | 0.91 | NaN | 368 | 37147 | Greenville, NC | Greenville | NC | North Carolina |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 22-073-0004 | 32.509959 | -92.046196 | 0.0350 | 6.5 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | 235 | 22111 | Monroe, LA | Monroe | LA | Louisiana |
| 49-045-0004 | 40.600532 | -112.353414 | 0.0440 | 4.7 | 0.02 | 0.00 | 0.00 | 0.00 | 10.717593 | 715 | 49045 | Salt Lake City, UT | Salt Lake City | UT | Utah |
| 06-075-0005 | 37.765946 | -122.399044 | 0.0310 | 7.0 | 0.28 | 0.00 | 0.00 | 0.00 | NaN | 716 | 06081 | San Francisco-Oakland-Hayward, CA | San Francisco-Oakland-Hayward | CA | California |
| 47-157-0075 | 35.151699 | -89.850249 | 0.0385 | 6.6 | 0.86 | 0.36 | 0.00 | 0.00 | 17.284723 | 232 | 47167 | Memphis, TN-MS-AR | Memphis | TN-MS-AR | Tennessee |
| 72-021-0010 | 18.420089 | -66.150615 | 0.0130 | 5.7 | NaN | NaN | NaN | NaN | NaN | 91 | 72151 | San Juan-Carolina-Caguas, PR | San Juan-Carolina-Caguas | PR | Puerto Rico |
525 rows × 15 columns
# pm 2.5 concentration map (county choropleth over all 525 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
# NOTE(review): this GeoJSON is re-downloaded before every map below;
# fetching it once and reusing `counties` would avoid the repeated requests.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list
fig = px.choropleth(df, geojson=counties, locations='fips', color='pm25',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdbu_r",  # Viridis
                    range_color=(0, 13),
                    labels={'pm25':'PM 2.5 ug/m3'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0},
                  title='PM 2.5 Concentration')
print('PM 2.5 Concentration:')
fig.show()
PM 2.5 Concentration:
# Ozone concentration map (county choropleth over all 525 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list
fig = px.choropleth(df, geojson=counties, locations='fips', color='ozone',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdbu_r",  # Viridis
                    labels={'ozone':'Ozone ppm'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('Ozone Concentration:')
fig.show()
Ozone Concentration:
# people of color concentration map (county choropleth over all 525 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list
fig = px.choropleth(df, geojson=counties, locations='fips', color='color',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="solar_r",  # Viridis
                    labels={'color':'People of Color %'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('People of Color %:')
fig.show()
People of Color %:
# Low income distribution map (county choropleth over all 525 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list
fig = px.choropleth(df, geojson=counties, locations='fips', color='income',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdpu",  # Viridis
                    labels={'income':'Low Income %'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('Low Income Distribution:')
fig.show()
Low Income Distribution:
# Low education distribution map (county choropleth over all 525 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list
fig = px.choropleth(df, geojson=counties, locations='fips', color='education',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdpu",  # Viridis
                    labels={'education':'Less than Highschool Education %'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('Low Education Distribution:')
fig.show()
Low Education Distribution:
# High language isolation distribution map (county choropleth, 525 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list
fig = px.choropleth(df, geojson=counties, locations='fips', color='language',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdpu",  # Viridis
                    range_color=(0,0.3),
                    labels={'language':'Language Isolation %'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('High Language Isolation Distribution:')
fig.show()
High Language Isolation Distribution:
# sensors with > 84% dates covered within 2019-2020
# quantity of data reduced to 50% (525 sensors -> 262 sensors)
# FIX: .copy() so the label assignments below modify an independent frame
# instead of a view (SettingWithCopyWarning was being suppressed globally).
sensor_list_new = sensor_list[sensor_list.date_counts > sensor_list.date_counts.median()].copy()
# apply dictionary
sensor_list_new['cbsa'] = sensor_list_new.index.map(sensor_cbsa_dict)
sensor_list_new['cbsa0'] = sensor_list_new.index.map(sensor_cbsa_dict0)
sensor_list_new['cbsa1'] = sensor_list_new.index.map(sensor_cbsa_dict1)
sensor_list_new['state'] = sensor_list_new.index.map(sensor_state_dict)
sensor_list_new.head(1)
| sensor_lat | sensor_long | ozone | pm25 | color | education | language | income | median_temp | date_counts | fips | cbsa | cbsa0 | cbsa1 | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sensor_id | |||||||||||||||
| 08-031-0026 | 39.77949 | -105.00518 | 0.043 | 6.2 | 0.59 | 0.32 | 0.18 | 0.94 | 11.504629 | 631 | 08093 | Denver-Aurora-Lakewood, CO | Denver-Aurora-Lakewood | CO | Colorado |
# outliers in pm2.5, language, education
# One box per metric, laid out left-to-right in a single row.
fig, axes = plt.subplots(1, 6, figsize=(10, 3))
for ax, metric in zip(axes, ('pm25', 'ozone', 'color', 'income', 'language', 'education')):
    sensor_list_new[[metric]].boxplot(ax=ax)
plt.tight_layout()
plt.show()
# date count stats (filtered sensors)
# FIX: the old message described the unfiltered 525-sensor list (min 41
# dates); the table below is describe() of the 262 *filtered* sensors.
print('262 filtered sensors - max 726 dates, min 613 dates, median 690 days of data:')
display(sensor_list_new.date_counts.describe().astype(int).to_frame())
525 sensors - max 726 dates, min 41 dates, 75% percentile of data has 690 days of data:
| date_counts | |
|---|---|
| count | 262 |
| mean | 682 |
| std | 28 |
| min | 613 |
| 25% | 664 |
| 50% | 690 |
| 75% | 705 |
| max | 726 |
# plot sensor distribution - violin with box overlay, one panel per metric
fig, axes = plt.subplots(2, 4, figsize=(10, 6))
panel_layout = [('pm25', (0, 0)), ('ozone', (0, 1)), ('median_temp', (0, 2)),
                ('color', (1, 0)), ('income', (1, 1)), ('language', (1, 2)),
                ('education', (1, 3))]
# violins first, then boxplots drawn on top of the same axes
for metric, pos in panel_layout:
    sns.violinplot(data= sensor_list_new, y=metric, ax=axes[pos])
for metric, pos in panel_layout:
    sensor_list_new[[metric]].boxplot(ax=axes[pos])
plt.suptitle('Sensor Distribution')
plt.tight_layout()
plt.show()
# plot sensor distribution - swarm (pollutants + median temp)
fig, axes = plt.subplots(1, 3, figsize=(8, 3))
for ax, metric in zip(axes, ('pm25', 'ozone', 'median_temp')):
    sns.swarmplot(data= sensor_list_new, y=metric, ax=ax)
plt.suptitle('Sensor distribution vs Pollutants vs Median temp')
plt.tight_layout()
plt.show()
# community characteristics - swarm, one panel per demographic metric
fig, axes = plt.subplots(1, 4, figsize=(10, 3))
for ax, metric in zip(axes, ('color', 'income', 'language', 'education')):
    sns.swarmplot(data= sensor_list_new, y=metric, ax=ax)
plt.suptitle('Sensor distribution vs Community Characteristics')
plt.tight_layout()
plt.show()
We have a fair coverage of sensor measurement:
Most communities have:
EJ communities are mostly defined by:
people of color proportion > Low income proportion
# sns.pairplot(sensor_list_new, kind='kde')
# plt.show()
# Pairwise correlations over the numeric columns.
# FIX: numeric_only=True matches the old pandas default and is required on
# pandas >= 2.0 (cbsa / state / fips are object columns).
corr = sensor_list_new.corr(numeric_only=True)
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.1f')
plt.title('Bivariate Analysis')
plt.show()
Let's impute the missing median_temp values with a simple linear regression on sensor latitude.
# Split sensors by whether median_temp was observed; the missing ones are
# imputed below from a latitude -> temperature linear model.
# Cleanups: boolean masks need no '== True/False'; reshape(-1, 1) replaces
# reshape(len(...), 1); the empty-list DataFrame construction is simplified.
target = sensor_list_new[sensor_list_new.median_temp.isnull()]
train = sensor_list_new[sensor_list_new.median_temp.notnull()]
# train: latitude as the single feature, median_temp as the response
train_X = train['sensor_lat'].values.reshape(-1, 1)
train_y = train['median_temp'].values.reshape(-1, 1)
# target (to predict): features only; predictions attached to target_y later
target_X = target['sensor_lat'].values.reshape(-1, 1)
target_y = pd.DataFrame(index=target.index)
train_X.shape, train_y.shape, target_X.shape, target_y.shape
((167, 1), (167, 1), (95, 1), (95, 0))
from sklearn.linear_model import LinearRegression
# fit model with sensor latitude vs median_temp
lr = LinearRegression()
lr.fit(train_X, train_y)
# intercept & coefficient of fitted model, flattened to [intercept, slope]
coeffs = np.array(list(lr.intercept_.flatten()) + list(lr.coef_.flatten()))
coeffs = list(coeffs)
Almost perfect reverse linear relationship (slope = -1) between sensor_lat vs median_temp.
# train model r squared (coefficient of determination on the training set)
print(f'Train model R-squared: {lr.score(train_X, train_y)}')
Train model R-squared: 0.7993042696605425
~80% of train data is explained by this linear model: y = 52.66 -1.01 x
# fill in missing values from linear regression model
# predict median_temp from latitude for the sensors that lacked it
pred = lr.predict(target_X)
target_y['pred_median_temp'] = pred
target_y
| pred_median_temp | |
|---|---|
| sensor_id | |
| 20-177-0013 | 13.194426 |
| 42-049-0003 | 10.041964 |
| 06-055-0004 | 13.948206 |
| 04-013-4003 | 18.878620 |
| 04-013-4005 | 18.870439 |
| ... | ... |
| 35-001-1012 | 17.076565 |
| 25-017-0010 | 9.566352 |
| 10-003-1007 | 12.661478 |
| 34-023-0011 | 11.740376 |
| 06-075-0005 | 14.466864 |
95 rows × 1 columns
# Sensors with an observed median_temp.
# FIX: pandas-native notna() replaces np.isnan(...) == False.
df1 = sensor_list_new[sensor_list_new.median_temp.notna()]
df1.shape
(167, 15)
# Sensors missing median_temp, filled with the regression predictions.
# FIX: isna() replaces np.isnan(...) == True, and .copy() avoids the
# inplace drop on a slice of sensor_list_new (SettingWithCopyWarning).
df2 = sensor_list_new[sensor_list_new.median_temp.isna()].copy()
# drop-then-reassign keeps median_temp as the last column, as before
df2 = df2.drop(columns='median_temp')
df2['median_temp'] = pred
df2.shape
(95, 15)
# merge df1, df2
# concat aligns columns by name, so df2's trailing median_temp is fine
sensor_list_new = pd.concat([df1, df2])
sensor_list_new.shape
(262, 15)
# no more missing values after imputation
sensor_list_new.isnull().sum()
sensor_lat 0 sensor_long 0 ozone 0 pm25 0 color 0 education 0 language 0 income 0 median_temp 0 date_counts 0 fips 0 cbsa 0 cbsa0 0 cbsa1 0 state 0 dtype: int64
# 262 filtered sensor ids (observed first, then imputed)
sensor_list_new.index
Index(['08-031-0026', '06-031-1004', '06-071-0306', '18-089-0022',
'06-099-0005', '40-031-0651', '06-037-1103', '19-163-0015',
'06-065-8001', '01-073-0023',
...
'42-129-0008', '46-127-0001', '06-013-0002', '50-007-0007',
'42-091-0013', '35-001-1012', '25-017-0010', '10-003-1007',
'34-023-0011', '06-075-0005'],
dtype='object', name='sensor_id', length=262)
# the 167 sensor ids with observed median_temp
df1.index
Index(['08-031-0026', '06-031-1004', '06-071-0306', '18-089-0022',
'06-099-0005', '40-031-0651', '06-037-1103', '19-163-0015',
'06-065-8001', '01-073-0023',
...
'06-111-2002', '25-015-4002', '32-031-1007', '56-021-0100',
'06-053-0002', '49-035-3013', '06-083-2011', '25-013-0008',
'39-035-0060', '49-045-0004'],
dtype='object', name='sensor_id', length=167)
# Linear Regression Plot
# lmplot draws all (actual + imputed) points with the fitted line; the
# scatterplot then re-marks the *observed* sensors (df1) in salmon on top.
sns.lmplot(x='sensor_lat', y='median_temp', data=sensor_list_new)
# NOTE(review): legend labels are positional — confirm they match the artists
plt.legend(['Predicted Temp','Regression Model'])
sns.scatterplot(x='sensor_lat', y='median_temp', data=sensor_list_new[sensor_list_new.index.isin(df1.index)], color='#FA8072' ,alpha=1.0)
plt.title('Actual & Predicted Median Temp vs Sensor Latitude')
plt.show()
# Correlations after temperature imputation.
# FIX: numeric_only=True is required on pandas >= 2.0 (object columns present).
corr = sensor_list_new.corr(numeric_only=True)
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.1f')
plt.title('Bivariate Analysis')
plt.show()
sensor_list_new
| sensor_lat | sensor_long | ozone | pm25 | color | education | language | income | median_temp | date_counts | fips | cbsa | cbsa0 | cbsa1 | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sensor_id | |||||||||||||||
| 08-031-0026 | 39.779490 | -105.005180 | 0.043 | 6.2 | 0.59 | 0.32 | 0.18 | 0.94 | 11.504629 | 631 | 08093 | Denver-Aurora-Lakewood, CO | Denver-Aurora-Lakewood | CO | Colorado |
| 06-031-1004 | 36.315670 | -119.643447 | 0.046 | 10.8 | 0.90 | 0.43 | 0.28 | 0.82 | 17.407408 | 695 | 06031 | Hanford-Corcoran, CA | Hanford-Corcoran | CA | California |
| 06-071-0306 | 34.510961 | -117.325540 | 0.049 | 7.9 | 0.83 | 0.20 | 0.16 | 0.81 | 16.578703 | 686 | 06071 | Riverside-San Bernardino-Ontario, CA | Riverside-San Bernardino-Ontario | CA | California |
| 18-089-0022 | 41.606662 | -87.304943 | 0.036 | 7.0 | 0.86 | 0.04 | 0.00 | 0.76 | 11.180556 | 643 | 55059 | Chicago-Naperville-Elgin, IL-IN-WI | Chicago-Naperville-Elgin | IL-IN-WI | Indiana |
| 06-099-0005 | 37.642165 | -120.994212 | 0.042 | 6.5 | 0.40 | 0.28 | 0.02 | 0.75 | 17.766205 | 671 | 06099 | Modesto, CA | Modesto | CA | California |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35-001-1012 | 35.185200 | -106.508150 | 0.048 | 4.2 | 0.30 | 0.00 | 0.01 | 0.07 | 17.076565 | 707 | 35061 | Albuquerque, NM | Albuquerque | NM | New Mexico |
| 25-017-0010 | 42.612085 | -71.306986 | 0.033 | 6.5 | 0.20 | 0.03 | 0.04 | 0.05 | 9.566352 | 691 | 33017 | Boston-Cambridge-Newton, MA-NH | Boston-Cambridge-Newton | MA-NH | Massachusetts |
| 10-003-1007 | 39.551300 | -75.732000 | 0.037 | 6.7 | 0.28 | 0.05 | 0.00 | 0.04 | 12.661478 | 641 | 42101 | Philadelphia-Camden-Wilmington, PA-NJ-DE-MD | Philadelphia-Camden-Wilmington | PA-NJ-DE-MD | Delaware |
| 34-023-0011 | 40.462182 | -74.429439 | 0.037 | 6.8 | 0.44 | 0.08 | 0.04 | 0.04 | 11.740376 | 699 | 42103 | New York-Newark-Jersey City, NY-NJ-PA | New York-Newark-Jersey City | NY-NJ-PA | New Jersey |
| 06-075-0005 | 37.765946 | -122.399044 | 0.031 | 7.0 | 0.28 | 0.00 | 0.00 | 0.00 | 14.466864 | 716 | 06081 | San Francisco-Oakland-Hayward, CA | San Francisco-Oakland-Hayward | CA | California |
262 rows × 15 columns
# sanity check: 262 rows, 2 selected columns
sensor_list_new[['fips','color']].shape
(262, 2)
# Ozone concentration map (262 filtered sensors, scale capped at the max).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list_new
color_max = df.ozone.max()
fig = px.choropleth(df, geojson=counties, locations='fips', color='ozone',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdbu_r",  # Viridis
                    range_color=(0,color_max),
                    labels={'ozone':'Ozone ppm'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('Ozone Concentration:')
fig.show()
Ozone Concentration:
# pm 2.5 concentration map (262 filtered sensors, scale capped at the max).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list_new
color_max = df.pm25.max()
fig = px.choropleth(df, geojson=counties, locations='fips', color='pm25',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="rdbu_r",  # Viridis
                    range_color=(0,color_max),
                    labels={'pm25':'PM 2.5 ug/m3'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0},
                  title='PM 2.5 Concentration')
print('PM 2.5 Concentration:')
fig.show()
PM 2.5 Concentration:
# set up for all plots
# hover-text column summarizing demographics + ozone, used by the
# Scattergeo maps below via text=df['text']
df = sensor_list_new
df['text'] = '| State: ' + df['state'].astype(str) + ' | Color: ' + df['color'].astype(str) + ' | Income: ' + df['income'].astype(str) + ' | Education: ' + df['education'].astype(str) + ' | Ozone: ' + df['ozone'].astype(str)
# high people of color plot
# Square markers at each sensor location, shaded by people-of-color share;
# hover text comes from the precomputed df['text'] column.
df = sensor_list_new
fig = go.Figure(data=go.Scattergeo(
lon = df['sensor_long'],
lat = df['sensor_lat'],
mode = 'markers',
text = df['text'],
marker_color = df['color'],
marker = dict(
size = 8,
opacity = 0.8,
reversescale = False,
autocolorscale = False,
symbol = 'square',
line = dict(
width=1,
# NOTE(review): 'rgba(...)' carries only 3 channels — presumably 'rgb'
# was intended; confirm plotly accepts this string
color='rgba(102, 102, 102)'
),
colorscale = 'ylgnbu',
cmin = 0,
cmax = df['color'].max(),
colorbar_title="People of Color %"
))
)
fig.update_layout(
title = 'High People of Color Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
geo_scope='usa',
)
fig.show()
# people of color (county choropleth, 262 filtered sensors).
# FIX: the 'with' body lost its indentation in the notebook export;
# the commented-out sample-dataset read was dead code and is removed.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list_new
color_max = df.color.max()
fig = px.choropleth(df, geojson=counties, locations='fips', color='color',
                    color_continuous_scale="ylgnbu",
                    hover_name='cbsa',
                    scope="usa",
                    range_color=(0,color_max),
                    labels={'color':'People of Color %'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('High People of Color Distribution:')
fig.show()
High People of Color Distribution:
# low income plot
# Square markers at each sensor location, shaded by low-income share;
# hover text comes from the precomputed df['text'] column.
df = sensor_list_new
fig = go.Figure(data=go.Scattergeo(
lon = df['sensor_long'],
lat = df['sensor_lat'],
mode = 'markers',
text = df['text'],
marker_color = df['income'],
marker = dict(
size = 8,
opacity = 0.8,
reversescale = False,
autocolorscale = False,
symbol = 'square',
line = dict(
width=1,
# NOTE(review): 'rgba(...)' carries only 3 channels — confirm plotly accepts it
color='rgba(102, 102, 102)'
),
colorscale = 'hot_r',
cmin = 0,
cmax = df['income'].max(),
colorbar_title="Low income %"
))
)
fig.update_layout(
title = 'Low Income Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
geo_scope='usa',
)
fig.show()
# Low income distribution map (county choropleth, 262 filtered sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list_new
color_max = df.income.max()
fig = px.choropleth(df, geojson=counties, locations='fips', color='income',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="hot_r",  # Viridis
                    range_color=(0,color_max),
                    labels={'income':'Low Income %'},
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('Low Income Distribution:')
fig.show()
Low Income Distribution:
# low education plot
# Square markers at each sensor location, shaded by less-than-highschool share;
# hover text comes from the precomputed df['text'] column.
df = sensor_list_new
fig = go.Figure(data=go.Scattergeo(
lon = df['sensor_long'],
lat = df['sensor_lat'],
mode = 'markers',
text = df['text'],
marker_color = df['education'],
marker = dict(
size = 8,
opacity = 0.8,
reversescale = False,
autocolorscale = False,
symbol = 'square',
line = dict(
width=1,
# NOTE(review): 'rgba(...)' carries only 3 channels — confirm plotly accepts it
color='rgba(102, 102, 102)'
),
colorscale = 'gray_r',
cmin = 0,
cmax = df['education'].max(),
colorbar_title="Low education %"
))
)
fig.update_layout(
title = 'Low Education Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
geo_scope='usa',
)
fig.show()
# Low education distribution map (county choropleth, 262 filtered sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list_new
color_max = df.education.max()
fig = px.choropleth(df, geojson=counties, locations='fips', color='education',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="gray_r",  # Viridis
                    labels={'education':'Less than Highschool Education %'},
                    range_color=(0, color_max)
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('Low Education Distribution:')
fig.show()
Low Education Distribution:
# low language literacy plot
# Square markers at each sensor location, shaded by language-isolation share;
# hover text comes from the precomputed df['text'] column.
df = sensor_list_new
fig = go.Figure(data=go.Scattergeo(
lon = df['sensor_long'],
lat = df['sensor_lat'],
mode = 'markers',
text = df['text'],
marker_color = df['language'],
marker = dict(
size = 8,
opacity = 0.8,
reversescale = False,
autocolorscale = False,
symbol = 'square',
line = dict(
width=1,
# NOTE(review): 'rgba(...)' carries only 3 channels — confirm plotly accepts it
color='rgba(102, 102, 102)'
),
colorscale = 'hot_r',
cmin = 0,
cmax = df['language'].max(),
colorbar_title="Low Language Literacy %"
))
)
fig.update_layout(
title = 'Low Language Literacy Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
geo_scope='usa',
)
fig.show()
# High language isolation distribution map (county choropleth, 262 sensors).
# FIX: the 'with' body lost its indentation in the notebook export.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
df = sensor_list_new
color_max = df.language.max()
fig = px.choropleth(df, geojson=counties, locations='fips', color='language',
                    scope="usa",
                    hover_name='cbsa',
                    color_continuous_scale="hot_r",  # Viridis
                    range_color=(0,color_max),
                    labels={'language':'Language Isolation %'}
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print('High Language Isolation Distribution:')
fig.show()
High Language Isolation Distribution: